__author__ = 'maesleung'
# -*- coding:utf-8 -*-
from datetime import datetime
from bs4 import BeautifulSoup

import urllib.request,urllib.parse
import re
import http.cookiejar
import json
import codecs

def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Builds a cookie-aware opener with a desktop-browser User-Agent header
    so the site serves the regular HTML page.

    Parameters:
        url: absolute URL to fetch.

    Returns:
        The page body as a str (UTF-8 decoded).
    """
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'),
                         ('Cookie', '2c7505bca2e54d1e85df92d947f2cc5a')]
    # Use the opener directly instead of install_opener() + urlopen():
    # the original mutated the process-wide default opener on every call.
    # The with-block also ensures the response is closed (was leaked before).
    with opener.open(url) as resp:
        return resp.read().decode('utf-8')

def strtosz(stR):
    """Turn a bracketed list string into a list of item strings.

    E.g. '[72111,202,0]' -> ['72111', '202', '0'].  Every '[' and ']'
    character is removed, then the remainder is split on commas; items
    are returned as strings with no numeric conversion.
    """
    cleaned = stR.translate(str.maketrans('', '', '[]'))
    return cleaned.split(',')

def collect(url, path):
    """Scrape the "today's hottest" (今日最热) list from *url* and dump it as JSON.

    Fetches the listing page, extracts article ids and titles from the
    hottest-articles block, fetches each article's stat counters, and
    writes a JSON document {"data": [...]} to
    C:\\apache-tomcat-7.0.73\\webapps\\test\\<path>\\<yymmdd>.

    Parameters:
        url:  listing-page URL to scrape.
        path: sub-directory name under the webapp's test folder.
    """
    html_doc = getHtml(url)
    soup = BeautifulSoup(html_doc, 'html.parser')

    aidL = []  # article ids (data-aid attributes)
    titL = []  # article titles
    # Stats endpoint: returns counters like [72111,202,0,0,0,95,3,3]
    stat_base = 'http://www.acfun.tv/content_view.aspx?contentId='
    article_base = 'http://www.acfun.tv/a/ac'

    # Locate the "today's hottest" block and pull id/title off each anchor.
    for b in soup.find_all('div', attrs={"class": "block"}):
        if '今日最热' in str(b):
            for a in b.find_all('a'):
                aidL.append(a['data-aid'])
                titL.append(a['title'])

    # Per-article stat URLs and canonical article URLs.
    stat_urls = [stat_base + aid + '&channelId=110%20HTTP/1.1' for aid in aidL]
    art_urls = [article_base + aid for aid in aidL]

    # Fetch counters and convert '[72111,202,...]' into
    # ['72111', '202', ...] (lists of strings).
    info = [strtosz(getHtml(u)) for u in stat_urls]

    # Counter layout (by index):
    # 0: views/plays | 1: comments | 2,3: unused | 4: danmaku
    # 5: favourites | 6: bananas   | 7: unused

    # Top 10 entries at most; min() guards against pages that yield fewer
    # anchors (the original hard-coded range(10) and could raise IndexError).
    json_list = []
    for r in range(min(10, len(aidL))):
        json_list.append({
            "aid": aidL[r],
            "title": titL[r],
            "url": art_urls[r],
            "views": info[r][0],
            "comments": info[r][1],
            "bananas": info[r][6],
        })

    jj = json.dumps({"data": json_list}, ensure_ascii=False)

    # One output file per day, named yymmdd, under the given category dir.
    now = datetime.now()
    fpath = 'C:\\apache-tomcat-7.0.73\\webapps\\test\\%s\\%s' % (path, now.strftime('%y%m%d'))
    with codecs.open(fpath, 'w+', 'utf-8') as f:
        f.write(jj)

if __name__ == '__main__':
    # Listing pages to scrape, paired position-by-position with the
    # output sub-directory each one is written to.
    urls = ['http://www.acfun.tv/v/list110/index.htm', 'http://www.acfun.tv/v/list73/index.htm',
            'http://www.acfun.tv/v/list74/index.htm', 'http://www.acfun.tv/v/list75/index.htm',
            'http://www.acfun.tv/v/list164/index.htm']
    path = ['complex', 'work', 'animation', 'comic', 'game']

    # zip() pairs each URL with its category — clearer than indexing
    # via range(len(urls)).
    for u, p in zip(urls, path):
        collect(u, p)
        print(p + ' DONE!')